/*
Compile the data to do the baseline analysis. 
This file generates both incidental mergers from the LBD and the compiled file analysis_data2007 
(among other data files) to be used in data_analyze.do file
*/

clear all
set more off

**************************************************************************
/*Generate Mergers and Incidental Mergers*/
**************************************************************************

/* Sometimes a plant's firm identifier will switch back and forth from one year to the next (i.e., it will be X in year t and t+2, 
but Y (where Y might be a missing value) in  year t+1.  This might make it seem that there are a lot more mergers and acquisitions 
than there actually are.  So, to avoid this overcounting of acquisitions, if a plant has the same firmid
at t-1 and t+1, we change the firmid at t to the same firmid as t-1 and t+1.  */

/* We say that an acquisition occurs if 
a) the establishment, i, changes hands between years t and t+1
b) the establishment, i, has the same firmid t+1 and t+2 (not required if year is 2006)
c) there exists some other firm in year t that has establishments that share the same firm identifier in year t+1 as i.
*/

* Generate 2002 to 2007 mergers
* generate incidental mergers

local construct_vi_industries=1 /* Indicator variable: Do you want to construct the mapping of pairs of vertically-related 6-digit NAICS industries? Note, this portion of the text draws on a dataset (sicTable01.csv ) that is constructed in the replication files of our earlier paper: "Vertical Integration and Input Flows" */
local compile_im=0  /* Indicator variable: Do you construct the part of the dataset which produces the incidental merger sample? */
local compile_rest=1 /* Indicator variable: Do you construct the rest of the dataset */
local compile_panel=1 /*Indicator for constructing 2002-2007 panel*/
local numseg=3   /* Maximum: how many segments to exclude when computing the incidental merger sample */
local var =  "pay"    /* Variable which defines the variable to sort on when computing firms' top segments */
local indvar = "naicsup"  /* Variable which defines the upstream industry */
local indvardown = "naicsdown" /* Variable which defines the downstream industry */

/* LBDXXX is simply the raw dataset of the Longitudinal Business Database from year = XXXX 
  without any processing. Same idea for the CMF or the CFS.*/

if `construct_vi_industries'==1 {
/* We construct our database of pairs of vertically related-NAICS industries 
   from our database of pairs of vertically-related SIC industries, the latter 
   database which was constructed as part of earlier paper ("Vertical Integration
   and Input Flows"). We first need a concordance between NAICS codes and SIC codes.
   To do so, we use the 1997 LBD in which establishments have both industry codes. */
  
use emp pay lbdnum firmid bestnaics bestsic if length(bestnaics)>1 & length(bestsic)>1 using lbd1997, replace
replace bestsic=substr(bestsic,1,4)
replace bestnaics=substr(bestnaics,1,6)
collapse (sum) pay, by(bestnaics bestsic)
drop if pay==0
bys bestsic: egen sum_pay_sic=sum(pay)
bys bestnaics: egen sum_pay_naics=sum(pay)
gen frac_pay_naics=pay/sum_pay_naics
keep if frac_pay_naics>0.001 
/* This line of code is not necessary (and doesn't impact the construction of naicsTableXX.csv below)
   but speeds up the "joinby" line of code, about 12 lines below. */
keep best* frac_pay_naics
destring bestsic, replace
save naics_sic_corresp, replace

insheet using sicTable01.csv , clear 
ren sicup bestsic
joinby bestsic using naics_sic_corresp 
replace frac=frac*frac_pay_naics
collapse (sum) frac, by(bestnaics sic4)
ren bestnaics naicsup
ren sic4 bestsic
joinby bestsic using naics_sic_corresp 
replace frac=frac*frac_pay_naics
collapse (sum) frac, by(bestnaics naicsup)
ren bestnaics naicsdown
bys naicsup: egen s1=sum(frac)
replace frac=frac/s1
drop s1
keep if frac>=0.01
outsheet using naicsTable10.csv, replace
keep if frac>=0.02
outsheet using naicsTable20.csv, replace
keep if frac>=0.04
outsheet using naicsTable40.csv, replace

}

/* "im" is the acronym for "incidental merger" */ 

if `compile_im'==1 {

use lbd2007, clear
append using lbd2002 lbd2003 lbd2004 lbd2005 lbd2006 lbd2008 lbd2009
gen na4=substr(bestnaics,1,4)
bys lbdnum: egen na4_temp=mode(na4)
replace na4=na4_temp if na4_temp~=""
rename yr year
keep lbdnum firmid na4 pay emp year
/*there are a lot of missing firmids across years, fix that first
replace missing firmid by previous period firmid if previous one period firmid == post one period firmid*/ 
sort lbdnum year
by lbdnum: replace firmid = firmid[_n-1] if  firmid[_n-1]==firmid[_n+1] & firmid[_n-1]~="" & firmid[_n+1]~=""

* if post one period and post two period have the same firmid, but different from the previous one period,
* assume the firmid is the same as the post one period.
by lbdnum: replace firmid=firmid[_n+1] if firmid=="" & firmid[_n+1]==firmid[_n+2] & firmid[_n+2]~="" & firmid[_n+1]~=""
count if firmid==""
save tempfirmid, replace

gen n=1
collapse (sum) n, by(firmid year)
rename n nest
sort firmid year
by firmid: gen nestpre=nest[_n-1]

merge 1:m firmid year using tempfirmid
drop _merge
save tempfirmid, replace

/* This is the portion of the code in which we compute the top segments of each firm according to their payroll in the year before the merger. */

collapse (sum) pay emp, by (na4 firmid year)
drop if na4==""
drop if `var'==0 
bys firmid year: gen num_seg=_N
gsort firmid year -`var'
by firmid year: gen rank=_n
keep na4 firmid num_seg year rank
drop if rank>`numseg'
reshape wide na4, i(firmid num_seg year) j(rank)
sort firmid year
by firmid: gen num_seg_pre=num_seg[_n-1]
by firmid: gen na41_pre=na41[_n-1]
by firmid: gen na42_pre=na42[_n-1]
by firmid: gen na43_pre=na43[_n-1]

merge 1:m firmid year using tempfirmid
drop _merge

sort lbdnum year  
by lbdnum: gen merger_all=1 if firmid~=firmid[_n+1] & firmid~="" & firmid[_n+1]~="" & nestpre[_n+1]~=.
by lbdnum: replace merger_all=. if firmid[_n+1]~=firmid[_n+2] & firmid[_n+1]~="" & firmid[_n+2]~=""

save firmids200209, replace
* Now save plantid that experienced merger and acquisition between 2002 to 2007
drop if year==2008 | year==2009
replace merger_all=0 if merger_all~=1
sort lbdnum year

* Some plants switch hands multiple times during 2002 to 2007
* use the last occurance (also make sure the last transaction is not sold back to the original firm in 2002)
by lbdnum: gen sind=sum(merger_all)
by lbdnum: egen sindmax=max(sind)
*list lbdnum if sind>1
by lbdnum: replace merger_all=. if merger_all==1 & sind<sindmax
save temp1, replace

use lbd2002, clear
keep lbdnum firmid
rename firmid firmid2002
merge 1:m lbdnum using temp1
drop if _merge==1
drop _merge
replace merger_all=. if merger_all~=1
gen mrgeyr=merger_all*(year)
save temp2, replace

use temp2, replace
sort lbdnum year
bys firmid: gen n=_N
/*All statistics for buyer firm needs to be from the merger years, prior the year of actual firmid change*/
sort lbdnum year
by lbdnum: gen buyer=firmid[_n+1]
by lbdnum: gen nestbuyer=nestpre[_n+1]
by lbdnum: gen buyer_num_seg=num_seg_pre[_n+1]
by lbdnum: gen buyerna4_1=na41_pre[_n+1]
by lbdnum: gen buyerna4_2=na42_pre[_n+1]
by lbdnum: gen buyerna4_3=na43_pre[_n+1]
/*All statistics for ownf (seller) are from the merger year. Merger year is the year prior the
actual change of firmid*/
rename firmid ownf
by lbdnum: gen nestownf=n
by lbdnum: gen ownf_num_seg=num_seg
by lbdnum: gen ownfna4_1=na41
by lbdnum: gen ownfna4_2=na42
by lbdnum: gen ownfna4_3=na43
rename na4 plantna4
drop if merger_all~=1
drop if buyer==firmid2002 & merger_all==1 & buyer~="" & firmid2002!=""
drop if buyer==""
drop year
keep buyer mrgeyr merger_all ownf lbdnum firmid2002 nest*  ///
buyer* ownf* plantna4
bys mrgeyr ownf buyer: gen num_purc=_N
save mergers_activity_2002_2007, replace

/*rm temp1.dta
rm temp2.dta
*/
  
/***********************************************************
		Generate Incidental Merger Indicator
************************************************************/

* find mergers outside top business segments of the firms

foreach segcutoff in 1 2 3{
use mergers_activity_2002_2007, clear
		if `segcutoff'==1{
gen outbuyer`segcutoff'=1 if plantna4~=buyerna4_1 & buyerna4_1~="" & plantna4~="" 
gen outownf`segcutoff'=1 if plantna4~=ownfna4_1 & ownfna4_1~="" & plantna4~="" 

		}
		if `segcutoff'==2{
gen outbuyer`segcutoff'=1 if plantna4~=buyerna4_1 & plantna4~=buyerna4_2 & buyerna4_1~="" & buyerna4_2~="" & plantna4~="" 
gen outownf`segcutoff'=1 if plantna4~=ownfna4_1 & plantna4~=ownfna4_2 & ownfna4_1~="" & ownfna4_2~="" & plantna4~="" 

		}
		if `segcutoff'==3{
gen outbuyer`segcutoff'=1 if plantna4~=buyerna4_1 & plantna4~=buyerna4_2 & plantna4~=buyerna4_3 & buyerna4_1~="" & buyerna4_2~="" & buyerna4_3~="" & plantna4~="" 
gen outownf`segcutoff'=1 if plantna4~=ownfna4_1 & plantna4~=ownfna4_2 & plantna4~=ownfna4_3 & ownfna4_1~="" & ownfna4_2~="" & ownfna4_3~="" & plantna4~="" 

		}
/*Need to have at least three  segments*/
save top`segcutoff'im_na0207, replace

}

use top1im_na0207, clear
keep lbdnum mrgeyr outbuyer1 outownf1 
merge 1:1 lbdnum mrgeyr using top3im_na0207
drop _merge
merge 1:1 lbdnum mrgeyr using top2im_na0207
drop _merge
keep out* lbdnum mrgeyr buyer ownf merger_all firmid2002 nest*  ///
buyer* ownf* plantna4 num_purc
destring lbdnum, force replace
gen num_purc_ind=1 if num_purc>=2
save imall_0207_4, replace

foreach segcutoff in 1 2 3{
use imall_0207_4, clear
gen seg_ind_`segcutoff'=1 if buyer_num_seg>=`numseg' & ownf_num_seg>=`numseg'
gen outboth`segcutoff'=1 if outbuyer`segcutoff'==1 & outownf`segcutoff'==1
bys buyer ownf mrgeyr: egen imb_`segcutoff'=sum(outbuyer`segcutoff')
bys buyer ownf mrgeyr: egen imo_`segcutoff'=sum(outownf`segcutoff')
bys buyer ownf mrgeyr: egen imbo_`segcutoff'=sum(outboth`segcutoff')
/* used to have imb_`segcutoff'<num_purc or imbo_`segcutoff'<num_purc */
gen imbuyer_`segcutoff'=1 if outbuyer`segcutoff'~=. & num_purc_ind==1 & seg_ind_`segcutoff'==1
gen imownf_`segcutoff'=1 if outownf`segcutoff'~=. & num_purc_ind==1 & seg_ind_`segcutoff'==1
gen imboth_`segcutoff'=1 if outboth`segcutoff'~=. &  num_purc_ind==1 & seg_ind_`segcutoff'==1
save imall_0207_4, replace
}

use lbd2007, clear
keep lbdnum
destring lbdnum, force replace
merge 1:1 lbdnum using imall_0207_4
drop if _merge!=3
drop _merge
save im_2007plants_4, replace

}

/* End of the compile_im loop */

***************************************************************************
* Generate the rest of the variables we need in regression
***************************************************************************

if `compile_rest'==1 {
local yr=2007
local pctlink=10   /* This variable takes either values 10, 20, or 30. It parameterizes the necessary value,
   what share of naics industry I's sales are sold to industry J's plants, 1 percent, 2 percent, or 3 percent,
   when defining I->J as a vertical linnk. The default value is 10 */
local preyr=`yr'-1

/*First, get naicsup from both lbd`yr' and lbd`preyr', some naicsup are missing in the
current year, so use the pre year. Also, some cfn are missing in the current year,
use the previous year to match with cfs*/

/* Here, we are imputing (for cases in which the variable is missing in year=2007) values of 
the variable from 2006. This sort of procedure is adopted throughout data_compile.do and 
data_analyze.do . Below, when using lbd20062007 (or, more generally, lbd`preyr'`yr'), we 
will only keep the most recent year for the given establishment.   

*/ 

use lbd`preyr', clear
append using lbd`yr'
gen naicsup=substr(bestnaics,1,6)
destring naicsup, replace
sort lbdnum yr
foreach v in bestnaics lbdnum cfn county zip{
	by lbdnum: replace `v'=`v'[_n-1] if `v'=="" & `v'[_n-1]!=""
}
foreach v in  mu emp pay naicsup {
	by lbdnum: replace `v'=`v'[_n-1] if `v'==. & `v'[_n-1]!=.
}
save lbd`preyr'`yr'naics, replace

use tempfirmid, replace
keep year firmid lbdnum pay emp
ren pay payF
ren emp empF
ren firmid firmidF
ren year yr
keep if yr==2006 | yr==2007
merge 1:1 lbdnum yr using lbd`preyr'`yr'naics
replace pay=payF if pay==. & payF~=.
replace emp=empF if emp==. & empF~=.
replace firmid=firmidF if firmidF~=""
drop payF empF _merge firmidF
save lbd`preyr'`yr'naics, replace

use tempfirmid, replace
keep year firmid lbdnum
ren firmid firmidT
save tempfirmid_less, replace

use  lbd`preyr'`yr'naics, replace
gen year=. 
forvalues x=2002/2007 {
  replace year=`x'
  merge n:1 year lbdnum using tempfirmid_less
  drop if _merge==2
  drop _merge
  ren firmidT firmid`x'
}
drop year
save lbd`preyr'`yr'naics, replace

* no cfn, has id
  
use cfs`yr', clear 
drop if itm_fl_exp_yn=="Y"  /* Drop exports */
gen cfn=substr(id,1,10)
gen vs=itm_fd_shipmt_val*bwhiqj_cfs /*$ as units*/
gen weight=itm_fe_shipmt_wght*bwhiqj_cfs 
gen orig_zip=substr(zip,1,5)
destring orig_zip, force replace
bys id: egen orig_zip_temp=mode(orig_zip)
replace orig_zip=orig_zip_temp if orig_zip_temp~=. 
drop orig_zip_temp
keep cfn vs dest_zip orig_zip weight
save cfs`yr'cleaned, replace

use lbd`preyr'`yr'naics, clear
gsort yr cfn -pay
by yr cfn: keep if _n==1
bys cfn: gen ni=_N
drop if ni>=2 & yr!=2007
drop ni
merge 1:m cfn using cfs`yr'cleaned
drop if _merge==1
drop _merge
save cfs`yr'cleaned, replace

gen ind=1 if lbdnum==""
replace ind=0 if ind==.
count if lbdnum==""
local mis=r(N)
count if lbdnum!=""
local mat=r(N)
dis `mis'/(`mis'+`mat')
  
drop if lbdnum==""
destring firmid lbdnum dest_zip, force replace

/* Here, we compute the destination zip codes for each upstream industry, 
  then the sending establishments for each upstream industry. After this, we 
  will loop over industries, forming Cartesian products of the sending 
  establishments (contained in temp_zip1) and the destination ZIP codes (contained
  in temp_zip) .

  */
  
preserve
	collapse (mean) vs, by(dest_zip `indvar') 
	keep dest_zip `indvar'
	count if dest_zip~=.
	save temp_zip1, replace
restore
collapse (mean) vs, by(lbdnum `indvar')
keep lbdnum `indvar'
egen group=group(`indvar')
save temp_zip, replace

use temp_zip, clear
su group, meanonly
	use if group==1 using temp_zip, clear
	drop group
	save temp_zip2, replace
	drop if _n~=1
	drop lbdnum
	merge 1:m `indvar' using temp_zip1, keep(3)
	drop _merge
	joinby `indvar' using temp_zip2
	save zipbank_`yr',replace
use temp_zip, clear
/* here we are looping over the different (upstream) industries within our sample, 
computing the Cartesian product of the sending establishments and destination ZIP codes */
su group, meanonly	
forvalues i = 2/`r(max)'{
	use if group==`i' using temp_zip, clear
	drop group
	save temp_zip2, replace
	drop if _n~=1
	drop lbdnum
	merge 1:m `indvar' using temp_zip1, keep(3)
	drop _merge
	joinby `indvar' using temp_zip2
	append using zipbank_`yr'
	save zipbank_`yr', replace
}

/* Get number of same firm establishments at destination */
use lbd`preyr'`yr'naics, clear
drop if lbdnum==""
keep lbdnum firmid zip yr pay
gen dest_zip=substr(zip,1,5)
destring dest_zip firmid, force replace
drop zip
gen firmtotest=1
ren pay firmsumpay
collapse (sum) firmtotest firmsumpay, by(firmid dest_zip yr)
drop if yr!=`yr' | firmid==.
save firmtotest`yr', replace

/* Get number of ds plants and total number of plants at dest_zip*/
use lbd`preyr'`yr'naics, clear
drop if yr!=`yr'  | firmid==""
keep zip `indvar' lbdnum pay
rename `indvar' `indvardown'
drop if lbdnum==""
gen dest_zip=substr(zip,1,5)
destring dest_zip, force replace
sort dest_zip `indvardown'
by dest_zip `indvardown': gen num_est=_n
by dest_zip `indvardown': egen sum_pay=sum(pay) 
by dest_zip: gen tot_num_est=_N
collapse (max) num_est sum_pay (mean) tot_num_est, by(dest_zip `indvardown')
save num_est_by_`indvardown'_`yr', replace

insheet using naicsTable`pctlink'.csv, clear
joinby `indvardown' using num_est_by_`indvardown'_`yr'
collapse (sum) sum_pay  num_est (mean) tot_num_est, by(dest_zip `indvar')
save num_est_`yr', replace

/*get vs from cfs*/
use cfs`yr'cleaned, clear
drop if lbdnum==""
destring firmid lbdnum dest_zip, force replace
collapse (sum) vs, by(lbdnum `indvar' dest_zip)
merge 1:1 lbdnum `indvar' dest_zip using zipbank_`yr'
/*unmatched due to missing naicsup*/
drop if _merge==1
drop _merge
save zipbank_vs_`yr', replace

use cfs`yr'cleaned, clear
drop if lbdnum==""
drop firmid `indvar' bestsic bestnaics
save temp, replace
use lbd`preyr'`yr'naics, clear
gsort yr lbdnum -pay
by yr lbdnum: keep if _n==1
bys lbdnum: gen ni=_N
drop if ni>=2 & yr!=`yr'
drop ni
merge 1:m lbdnum using temp
drop if _merge~=3
drop _merge
keep lbdnum firmid* county zip orig_zip mu emp pay naicsup 
gsort lbdnum -pay
by lbdnum: keep if _n==1
destring lbdnum firmid, force replace
merge 1:m lbdnum using zipbank_vs_`yr'
drop if _merge~=3
/*unmatched due to missing naicsup*/
drop _merge
merge m:1 `indvar' dest_zip using num_est_`yr'
replace num_est=0 if _merge==1
replace sum_pay=0 if _merge==1
/*unmatched due to not in CFS*/
drop if _merge==2
drop _merge

merge m:1 firmid dest_zip using firmtotest`yr'
drop if _merge==2
drop _merge
save temp_out, replace

use lbd`preyr'`yr'naics, clear
drop if yr!=`yr' |  firmid==""
keep firmid* lbdnum zip `indvar' pay
gen dest_zip=substr(zip,1,5)
destring firmid* lbdnum dest_zip, force replace
rename `indvar' `indvardown'
ren firmid2002 firmid2002t
merge 1:1 lbdnum using im_`yr'plants_4
drop firmid2002
ren firmid2002t firmid2002
gen merge_ind=1 if _merge==3
gen double firm_at_ma=.
forvalues x=2002/2006 {
  replace firm_at_ma=firmid`x' if mrgeyr==`x'
}
drop if dest_zip==. | `indvardown'==.
drop _merge
sort dest_zip `indvardown' firmid firm_at_ma

by dest_zip `indvardown' firmid: gen firmnumest=_N
by dest_zip `indvardown' firmid: egen firmsumpay=sum(pay)
by dest_zip `indvardown' firmid firm_at_ma: egen im1=sum(imboth_1)
by dest_zip `indvardown' firmid firm_at_ma: egen im2=sum(imboth_2)
by dest_zip `indvardown' firmid firm_at_ma: egen im3=sum(imboth_3)
by dest_zip `indvardown' firmid firm_at_ma: egen m_num=sum(merger_all)
by dest_zip `indvardown' firmid firm_at_ma: egen im1pay=sum(imboth_1*pay)
by dest_zip `indvardown' firmid firm_at_ma: egen im2pay=sum(imboth_2*pay)
by dest_zip `indvardown' firmid firm_at_ma: egen im3pay=sum(imboth_3*pay)
by dest_zip `indvardown' firmid firm_at_ma: egen m_numpay=sum(merger_all*pay)

collapse (max) firmnumest firmsumpay im1-m_numpay  mrgeyr, by(dest_zip `indvardown' firmid firm_at_ma)
save firmnumest_`yr', replace

insheet using naicsTable`pctlink'.csv, clear
joinby `indvardown' using firmnumest_`yr'
collapse (sum) firmsumpay firmnumest im1-m_numpay (max) mrgeyr, by(dest_zip `indvar' firmid firm_at_ma)
save firm_num_est_`yr', replace

use  temp_out, replace
joinby `indvar' dest_zip firmid using firm_num_est_`yr', unmatched(master)
save gen_firmfrac, replace

use gen_firmfrac, clear
sort lbdnum dest_zip firm_at_ma vs

foreach var of varlist firmtotest firmsumpay firmnumest {
   by lbdnum dest_zip: egen t=max(`var')
   replace `var'=t
   drop t
}

drop zip
compress

gen firmnumest2=firmnumest
replace firmnumest2=0 if dest_zip==orig_zip  
gen firmfrac = firmnumest/num_est if num_est~=0
gen firmfrac2 = firmnumest2/num_est if num_est~=0
gen firmfrac3 = firmsumpay/sum_pay if sum_pay~=0
/* We use firmfrac, below, but have played around with the other definitions 
   of the same-firm fraction.
*/
replace firmfrac=0 if num_est==0
replace firmfrac=0 if _merge==1
replace firmfrac2=0 if num_est==0
replace firmfrac2=0 if _merge==1
replace firmfrac3=0 if sum_pay==0
replace firmfrac3=0 if _merge==1
gen firmfrac_diff_zip=firmfrac
replace firmfrac_diff_zip=firmfrac_diff_zip-(1/num_est) if dest_zip == orig_zip & num_est~=0
count if firmfrac==1
drop _merge
merge m:1 lbdnum using imall_0207_4, keepusing(mrgeyr outbuyer1 outownf1 firmid2002 ownf) 
gen merge_ind=1 if _merge==3
drop if _merge==2
drop _merge

/* Get distances */

ren dest_zip zipcode 
merge n:1 zipcode using zipcodes,  keep(1 3)
ren zipcode dest_zip
ren longit dest_longit
ren latit dest_latit
drop _merge
ren orig_zip zipcode
merge n:1 zipcode using zipcodes, keep(1 3)
ren zipcode orig_zip
ren longit orig_longit
ren latit orig_latit

compress

/* There are a small number of zip codes with 
missing longitude/latitude.  Get the 
zip code that is "closest" (according to the number of the zip code)
and fillin the longitude and latitude from this close zip code */
gen nomiss=1-(orig_latit==.)
sort nomiss orig_zip
by nomiss orig_zip: gen idTemp=1 if _n==1
replace idTemp=sum(idTemp)
qui sum idTemp if nomiss==0, de
local rmax=r(max)
forvalues xq=1/`rmax' {
   qui sum orig_zip if idTemp==`xq'
   local oo=r(mean)  /* Get the identity of the zip code */
   qui sum orig_zip if orig_zip>`oo' & nomiss==1 /* Get the identity of the "next" zip code which doesn't have a missing value */
   local oo2=r(min)
   qui sum orig_latit if orig_zip==`oo2'
   replace orig_latit=r(mean) if orig_latit==. & orig_zip==`oo'
   qui sum orig_longit if orig_zip==`oo2'
   replace orig_longit=r(mean) if orig_longit==. & orig_zip==`oo'
}

drop nomiss idTemp

gen nomiss=1-(dest_latit==.)
sort nomiss dest_zip
by nomiss dest_zip: gen idTemp=1 if _n==1
replace idTemp=sum(idTemp)
qui sum idTemp if nomiss==0, de
local rmax=r(max)
forvalues xq=1/`rmax' {
   qui sum dest_zip if idTemp==`xq'
   local oo=r(mean)  /* Get the identity of the zip code */
   qui sum dest_zip if dest_zip>`oo' & nomiss==1 /* Get the identity of the "next" zip code which doesn't have a missing value */
   local oo2=r(min)
   qui sum dest_latit if dest_zip==`oo2'
   replace dest_latit=r(mean) if dest_latit==. & dest_zip==`oo'
   qui sum dest_longit if dest_zip==`oo2'
   replace dest_longit=r(mean) if dest_longit==. & dest_zip==`oo'
}
drop nomiss idTemp
drop if dest_latit==. /* There shouldn't be any of these zip codes, but just in case */
gen mileage2=sqrt( 69.31^2 * (dest_latit-orig_latit)^2 + 53.0^2 * (dest_longit-orig_longit)^2)

gen latit_1=orig_latit*atan(1)/45
gen latit_2=dest_latit*atan(1)/45
gen longit_1=orig_longit*atan(1)/45
gen longit_2=dest_longit*atan(1)/45
gen temp_mileage=(sin((latit_2-latit_1)/2))^2+cos(latit_1)*cos(latit_2)*(sin((longit_2-longit_1)/2))^2  
gen  mileage3=3957*2*atan2(sqrt(temp_mileage), sqrt(1-temp_mileage))
drop temp_mileage latit_1-longit_2
drop _merge
gen lmileage2=log(mileage3)
replace mileage2=mileage3

/* What if orig_zip==dest_zip (and mileage==0)?  Just set the value to 0 and then
 (in regressions in which we keep same_zip==1 observations)
  add an indicator in the regression right-hand side*/
qui sum lmileage2
replace lmileage2=r(min) if orig_zip==dest_zip | mileage2==0 
compress 
count if orig_zip==dest_zip
count if lmileage2==.
gen same_zip=0
replace same_zip=1 if orig_zip==dest_zip

destring ownf, replace force
tabstat im1pay im1 im2pay im2 , stats(sum)
forvalues x=1/3 {
  replace im`x'pay=0 if ownf==firm_at_ma & im`x'>0 & im`x'<.
  replace im`x'=0 if  ownf==firm_at_ma & im`x'>0 & im`x'<.   
}
sort lbdnum dest_zip
foreach var of varlist im1 im2 im3  im1pay im2pay im3pay  {
   by lbdnum dest_zip: egen temp_var=sum(`var')
   replace `var'=temp_var
   drop temp_var
}
forvalues x=1/3 {
  gen im`x'frac=im`x'/num_est
  replace im`x'frac=0 if im`x'frac==.
}
by lbdnum dest_zip: keep if _n==1
tabstat im1pay im1 im2pay im2 , stats(sum)

drop outbuyer1-ownf firmid2002-firmid2007

gen zero_vs_ind=1 if vs==. /*Generate an indicator for zero value of shipment*/
replace zero_vs_ind=0 if zero_vs_ind==.
replace vs=0 if vs==. /*replace 0 shipment with 1 and then take the log*/
gen shipment=vs
replace vs=1 if vs==0 
gen log_shipment = log(vs)
gen wholesale=(naicsup>=420000 & naicsup<460000)  /* note this includes a few wholesale-like retail establishments, like mail-order catalogues*/

bys orig_zip `indvar': egen tot_ship_o=sum(shipment)
bys dest_zip `indvar': egen tot_ship_d=sum(shipment)
bys `indvar': egen tot_ship=sum(shipment)

gen theta_i = tot_ship_o/tot_ship
gen theta_d = tot_ship_d/tot_ship
gen absorb_ratio=shipment/tot_ship_d

sum shipment, de
sum tot_ship_d, de
sum absorb_ratio, de

gen lmileageA=(exp(lmileage2)<=50 )
gen lmileageB=(exp(lmileage2)>50 & exp(lmileage2)<=100 )
gen lmileageC=(exp(lmileage2)>100 & exp(lmileage2)<=200 )
gen lmileageD=(exp(lmileage2)>200 & exp(lmileage2)<=500  )
gen lmileageE=(exp(lmileage2)>1000)

forvalues x=1/3 {
  gen im`x'payfrac= im`x'pay/sum_pay
  replace  im`x'payfrac=0 if  im`x'payfrac==.
}

if `pctlink'~=10  {
  save analysis_data2007_`pctlink', replace
}

if `pctlink'==10  {
  save analysis_data2007, replace
}

  
foreach v in  lmileage2 firmfrac firmfrac3 lmileageA lmileageB lmileageC lmileageD lmileageE  {
        di "`v'"
        egen MRDIS1`v' = sum(theta_i*`v'), by(dest_zip `indvar')
	egen MRDIS2`v' = sum(theta_d*`v'), by(orig_zip `indvar')
	egen MRDIS3`v' = sum(theta_i*theta_d*`v'), by(`indvar')
	gen `v'_mrW=`v'-(MRDIS1`v'+MRDIS2`v'-MRDIS3`v')
	drop MR* 
}


foreach v in  lmileage2 firmfrac firmfrac3 im1frac im2frac im3frac im3payfrac im1payfrac im2payfrac lmileageA lmileageB lmileageC lmileageD lmileageE   {
        di "`v'"
	egen mean1 = mean(`v'), by(dest_zip `indvar')
	egen mean2 = mean(`v'), by(orig_zip `indvar')
	egen mean3 = mean(`v'), by(`indvar')
	gen `v'_mr=`v'-mean1-mean2+mean3
	drop mean1 mean2 mean3
}

ren firmfrac3 firmpayfrac
ren firmfrac3_mr firmpayfrac_mr
ren firmfrac3_mrW firmpayfrac_mrW

compress

label var im1 "number of im establishments; top 1 segment"
label var im2 "number of im establishments; top 2 segments"
label var im3 "number of im establishments; top 3 segments"
label var im1pay "payroll of im establishments; top 1 segment : fraction of ds payroll"
label var im2pay "payroll of im establishments; top 2 segments : fraction of ds payroll"
label var im3pay "payroll of im establishments; top 3 segments : fraction of ds payroll"
label var im1frac "payroll of im establishments; top 1 segment "
label var im2frac "payroll of im establishments; top 2 segments"
label var im3frac "payroll of im establishments; top 3 segments"
label var naicsup "bestnaics of sending establishment"
label var emp "lbd employment of sending establishment"
label var pay "lbd payroll of sending establishment"
label var county "county of sending establishment"
label var orig_zip "zip code of sending establishment"
label var dest_zip "zip code of receiving establishment"
label var firmfrac "same firm share"
label var firmpayfrac "same firm share, weighted by establisments payroll"
label var firmtotest "total number of same-firm establishments in destination"
label var vs "value of shipments in CFS"
label var same_zip "indicator: is dest_zip==orig_zip"
label var mileage2 "mileage according to haversine formula"
label var lmileage2 "log mileage2; minimum distance if same_zip==1"

if `pctlink'~=10  {
  save analysis_data2007_`pctlink', replace
}

if `pctlink'==10  {
  save analysis_data2007, replace
}

	if `compile_panel'==1{
	local yr=2002
	local preyr=`yr'-1

	/*Fix that some cfn are missing in the current year,
	use the previous year to match with cfs*/

	use lbd`preyr', clear
	append using lbd`yr'
	gen naicsup=substr(bestnaics,1,6)
	destring naicsup, replace
	sort lbdnum yr
	foreach v in bestnaics lbdnum cfn county zip{
		by lbdnum: replace `v'=`v'[_n-1] if `v'=="" & `v'[_n-1]!=""
	}
	foreach v in  mu emp pay naicsup {
		by lbdnum: replace `v'=`v'[_n-1] if `v'==. & `v'[_n-1]!=.
	}
	save lbd`preyr'`yr'naics, replace

	use cfs`yr', clear 
	drop if export=="Y"
	rename cfn id
	gen cfn=substr(id,1,10)
	*gen vs=itm_fd_shipmt_val*bwhiqj_cfs /*$ as units*/
	*gen weight=itm_fe_shipmt_wght*bwhiqj_cfs 
	*gen orig_zip=substr(zip,1,5)
	destring orig_zip, force replace
	bys id: egen orig_zip_temp=mode(orig_zip)
	replace orig_zip=orig_zip_temp if orig_zip_temp~=. 
	drop orig_zip_temp
	keep cfn vs dest_zip orig_zip weight
	save cfs`yr'cleaned, replace

	use lbd`preyr'`yr'naics, clear
	gsort yr cfn -pay
	by yr cfn: keep if _n==1
	bys cfn: gen ni=_N
	drop if ni>=2 & yr!=`yr'
	drop ni
	merge 1:m cfn using cfs`yr'cleaned
	drop if _merge==1
	drop _merge
	save cfs`yr'cleaned, replace

	/*get vs from cfs*/
	use cfs`yr'cleaned, clear
	drop if lbdnum==""
	destring firmid lbdnum dest_zip, force replace
	collapse (sum) vs, by(lbdnum `indvar' dest_zip)
	merge 1:1 lbdnum `indvar' dest_zip using zipbank_2007
	/*unmatched due to missing naicsup*/
	drop if _merge==1
	gen cfs2002=1 if _merge==3
	drop _merge
	save zipbank_vs_`yr', replace

	use lbd`preyr'`yr'naics, clear
	gsort yr lbdnum -pay
	by yr lbdnum: keep if _n==1
	bys lbdnum: gen ni=_N
	drop if ni>=2 & yr!=`yr'
	drop ni
	gen orig_zip = substr(zip,1,5)
	destring orig_zip lbdnum, force replace
	drop if lbdnum==.
	merge 1:m lbdnum using zipbank_vs_`yr'
	drop if _merge==1
	drop _merge
	keep lbdnum `indvar' dest_zip orig_zip vs cfs2002
	save cfs`yr'cleaned1, replace

	gen zero_vs_ind=1 if vs==. /*Generate an indicator for zero value of shipment*/
	replace zero_vs_ind=0 if zero_vs_ind==.
	replace vs=0 if vs==. /*replace 0 shipment with 1 and then take the log*/
	gen shipment=vs
	replace vs=1 if vs==0 
	gen log_shipment = log(vs)
	bys orig_zip `indvar': egen tot_ship_o=sum(shipment)
	bys dest_zip `indvar': egen tot_ship_d=sum(shipment)
	bys `indvar': egen tot_ship=sum(shipment)

	gen theta_i = tot_ship_o/tot_ship
	gen theta_d = tot_ship_d/tot_ship
	gen absorb_ratio_`yr'=shipment/tot_ship_d
	rename vs vs2002
	keep lbdnum dest_zip absorb_ratio_`yr' vs2002 cfs2002
	save ab`yr', replace
	if `pctlink'~=10  {
		merge 1:1 lbdnum dest_zip using analysis_data2007_`pctlink'
		replace absorb_ratio_`yr'=0 if _merge==2
		rename _merge nopriortrade
		save analysis_data2007_`pctlink'_panel, replace
		}
	if `pctlink'==10  {
		merge 1:1 lbdnum dest_zip using analysis_data2007
		replace absorb_ratio_`yr'=0 if _merge==2
		rename _merge nopriortrade
		save analysis_data2007_panel, replace
		}
label var absorb_ratio_`yr' "Market Share in `yr'"

use cfs2002cleaned, clear
collapse (sum) vs, by(lbdnum)
save temp2002, replace

use cfs2007cleaned, clear
collapse (sum) vs, by(lbdnum)
merge 1:1 lbdnum using temp2002
drop if _merge!=3
drop _merge
keep lbdnum
destring lbdnum, force replace
merge 1:n lbdnum using analysis_data2007_panel
drop if _merge!=3
drop _merge
save panel_both0207, replace

	}
}
/* end of compile rest loop */

